First, we need to load in the necessary libraries:
library(arules)
library(arulesViz)
library(dplyr)
library(plotly)
library(data.table)
Then we need to load our dataset, as well as examine it a bit
data("AdultUCI")
dim(AdultUCI)
#> [1] 48842 15
AdultUCI
Next lets get rid of some columns we are not going to use/that has some issues, and then lets break down our numeric variables into factors using cut
data <- AdultUCI[, -c(3, 5, 11:12)]
colnames(data)[colnames(data) == "hours-per-week"] <- "hoursperweek"
data$age <- cut(data$age, breaks = c(15, 25, 45, 65, 100), labels = c("Young",
"Middleaged", "Senior", "Retired"))
data$hoursperweek <- cut(data$hoursperweek, breaks = c(0, 20, 40, 60, 80), labels = c("part-time",
"full-time", "hard-working", "need-a-life"))
str(data)
#> 'data.frame': 48842 obs. of 11 variables:
#> $ age : Factor w/ 4 levels "Young","Middleaged",..: 2 3 2 3 2 2 3 3 2 2 ...
#> $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
#> $ education : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 14 14 9 7 14 15 5 9 15 14 ...
#> $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
#> $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
#> $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
#> $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
#> $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
#> $ hoursperweek : Factor w/ 4 levels "part-time","full-time",..: 2 1 2 2 2 2 1 3 3 2 ...
#> $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
#> $ income : Ord.factor w/ 2 levels "small"<"large": 1 1 1 1 1 1 1 2 2 2 ...
Next, we convert the data to an object with class transactions, after viewing it again
data
data <- as(data, "transactions")
summary(data)
#> transactions as itemMatrix in sparse format with
#> 48842 rows (elements/itemsets/transactions) and
#> 109 columns (items) and a density of 0.09658517
#>
#> most frequent items:
#> native-country=United-States race=White
#> 43832 41762
#> workclass=Private sex=Male
#> 33906 32650
#> hoursperweek=full-time (Other)
#> 30037 332011
#>
#> element (itemset/transaction) length distribution:
#> sizes
#> 7 8 9 10 11
#> 27 974 2160 15714 29967
#>
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 7.00 10.00 11.00 10.53 11.00 11.00
#>
#> includes extended item information - examples:
#> labels variables levels
#> 1 age=Young age Young
#> 2 age=Middleaged age Middleaged
#> 3 age=Senior age Senior
#>
#> includes extended transaction information - examples:
#> transactionID
#> 1 1
#> 2 2
#> 3 3
Before we begin with our analysis, lets check out the rule frequencies within the dataset. We are looking for rules with support >= .2
itemFrequencyPlot(data, support = 0.2)
Next, lets mine some rules with the apriori algorithm, and then clean up redundant rules. We are still sorting out what to set the minsupp and minconf to.
zerules <- apriori(data, parameter = list(minlen = 2, supp = 0.2, conf = 0.3),
appearance = list(rhs = c("income=small", "income=large"), default = "lhs"),
control = list(verbose = F))
length(zerules)
#> [1] 23
redundant <- is.redundant(zerules)
zerules.pruned <- zerules[redundant == FALSE]
rulesorted <- sort(zerules.pruned, by = "lift", decreasing = TRUE)
length(rulesorted)
#> [1] 8
Next, let us inspect the rules, and examine their quality
(quality(rulesorted))
inspectDT(rulesorted)
First lets view a scatterplot of our rules
plot(rulesorted, method = "scatterplot", measure = c("confidence", "support"),
shading = "lift", engine = "htmlwidget")
Next lets look at a balloon plot
plot(rulesorted, method = "graph", measure = "confidence", shading = "lift",
engine = "htmlwidget")
Parallel plot
plot(rulesorted, method = "paracoord", measure = "confidence", shading = "lift",
control = list(reorder = T))
Two key plot
plot(rulesorted, method = "two-key plot", measure = "confidence", shading = "lift",
engine = "htmlwidget")
grouped plot
plot(rulesorted, method = "grouped", measure = "confidence", shading = "lift")
rule2 <- apriori(data, parameter = list(supp = 0.01, conf = 0.5), appearance = list(rhs = c("income=small",
"income=large"), default = "lhs"), control = list(verbose = F))
length(rule2)
#> [1] 4115
redundant <- is.redundant(rule2)
rulep <- rule2[redundant == FALSE]
rulesorted2 <- sort(rulep, by = "lift", decreasing = TRUE)
length(rulesorted2)
#> [1] 725
head(quality(rulesorted2))
inspectDT(rulesorted2)
plot(rulesorted2, method = "scatterplot", measure = c("confidence", "support"),
shading = "lift", engine = "htmlwidget")
plot(rulesorted2, method = "graph", measure = "confidence", shading = "lift",
engine = "htmlwidget")
plot(rulesorted2, method = "two-key plot", measure = "confidence", shading = "lift",
engine = "htmlwidget")
plot(rulesorted2, method = "grouped", measure = "confidence", shading = "lift")